Scrape the list of subdivisions (neighborhoods) of a city¶

In [1]:
# Using beautiful soup to scrape data
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

Create a function to get the latitude and longitude of a place from its address¶

In [2]:
# use the geocoder library; if it is not installed, run !conda install -c conda-forge geocoder
import geocoder
# An API key is required for the geocoder library to work. Ideally the key is saved in the
# OS environment variables (e.g. as BING_API_KEY) and read from there instead of being hard-coded.
import os
# A Bing Maps key is used here, since Bing geocoding is chosen over Google geocoding.
BING_API_KEY = 'AksNN-3luSfNBssyZ3Ju4i78nIrFLt1UtYo--YWQj9oyfxSwyXkdsqykWk3FeTXB' # os.environ['BING_API_KEY']
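Rather than hard-coding the key in the notebook, a safer pattern is to export it in the shell and read it here; a minimal sketch, assuming the key has been exported as BING_API_KEY:

# read the key from the environment, falling back to a placeholder if it is not set
# (the placeholder string below is just an illustration)
BING_API_KEY = os.environ.get('BING_API_KEY', '<your-bing-maps-key>')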
In [3]:
# This function takes an address and returns the latitude and longitude of that address
def get_latlng(address):
    # using the Bing geocoder API with the key defined above
    g = geocoder.bing(address, key = BING_API_KEY)
    return pd.Series(g.latlng)
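A quick sanity check of the helper (the address string is just an illustration; the printed values depend on the geocoder response):

lat, lng = get_latlng('Ranchi, Jharkhand, India')
print(lat, lng)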

Create a function to return a soup object from a URL¶

In [4]:
# Function returns a BeautifulSoup object for the given URL
def get_soup_object(url):
    source_data = requests.get(url).text
    return BeautifulSoup(source_data,'lxml')
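A slightly more defensive variant (a sketch, not used in the cells below; the function name is hypothetical) would add a timeout and fail loudly on a bad HTTP status instead of silently parsing an error page:

def get_soup_object_safe(url, timeout=10):
    # raise for non-200 responses before handing the text to BeautifulSoup
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')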

Ranchi¶

Step 1: Initialize the URL and get the soup object from the website¶

In [ ]:
# initialize url
rnc_data_url = 'http://vlist.in/district/364.html'
# use function to get soup object
soup = get_soup_object(rnc_data_url)
print('Soup object created')
In [ ]:
village_url_header = 'http://vlist.in'
district_name = 'Ranchi'

Step 2: Extract the rows from the website data¶

In [ ]:
# Function extracts a row of the table on the government website, returning the link and the
# name associated with that row
def extract_row(table_row):
    table_row = table_row.find_all('td')
    # first cell holds the serial number (not used further)
    index = table_row[0].text
    # second cell holds the name and a relative link to its detail page
    link = village_url_header + table_row[1].find('a')['href']
    name = table_row[1].text
    return link, name
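As a quick illustration of what extract_row does, the made-up row below mirrors the structure of the vlist.in tables (the href is purely illustrative):

sample_html = '<table><tr><td>1</td><td><a href="/block/0000.html">Sample Block</a></td></tr></table>'
sample_row = BeautifulSoup(sample_html, 'lxml').tr
link, name = extract_row(sample_row)
print(link, '|', name)  # -> http://vlist.in/block/0000.html | Sample Block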
In [ ]:
# extracting the block rows, skipping the first two rows of the table
table_rows = soup.find_all('tr')
table_rows = table_rows[2:]
data = []
# for every block row, also extract all of the villages in that block
for table_row in table_rows:
    
    sub_district_link, block_name = extract_row(table_row)
    print(block_name)
    # getting the villages in the block
    soup_village = get_soup_object(sub_district_link)
    # get all the table rows for the individual villages in the block (skip the header row)
    sub_table_rows = soup_village.find_all('tr')
    sub_table_rows = sub_table_rows[1:]
    
    # extract each village name and store it in data along with the block and district names
    for sub_table_row in sub_table_rows:
    
        sub_link, village_name = extract_row(sub_table_row)
        
        data.append([village_name, block_name, district_name])

print(data[0])

Step 3: Store the data in a dataframe¶

In [ ]:
# build a data frame from the extracted data
header = ['Village','Block','District']
df = pd.DataFrame(data= data, columns= header)
In [ ]:
df.head()

Step 4: Get latitude and longitude for subdivisions¶

In [ ]:
# using the get_latlng function to define latitude and longitude columns of the data frame
df[['Latitude','Longitude']] = df.apply(lambda x: get_latlng(x.Village +', '+ x.Block + ', ' + x.District), axis=1)
df.head()
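Every row triggers one geocoding request, so long tables can run into the Bing API's rate limits. A gentler variant (a hypothetical helper, not what the cell above uses) adds a short pause between calls:

import time

def get_latlng_throttled(address, pause=0.2):
    # same lookup as get_latlng, with a short pause between requests
    # (the pause length is an arbitrary choice)
    time.sleep(pause)
    g = geocoder.bing(address, key=BING_API_KEY)
    return pd.Series(g.latlng)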
In [ ]:
df.info()
In [ ]:
df.dropna(inplace= True)
df.info()

Step 5: Store the data in a csv¶

In [ ]:
# save the data to a csv file; it will be used later
df.to_csv('ranchi_villages.csv')
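Because to_csv also writes the default integer index as the first column, index_col=0 is needed when the file is read back (as done for the Kolkata file further below); for example (the variable name is arbitrary):

df_saved = pd.read_csv('ranchi_villages.csv', index_col=0)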

Now repeating all the above steps for Delhi, Mumbai, Kolkata and Chennai¶
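The four city scrapes below all follow the same pattern: fetch the page, slice out the relevant <ul> groups, and collect the <li> texts together with the city name. A small helper could wrap this; the function name and arguments here are hypothetical, and the slice bounds used in the cells below were found by inspecting each page:

def scrape_neighborhood_list(page_url, city, ul_start, ul_stop):
    # collect neighborhood names from the <ul> groups in the given slice
    page_soup = get_soup_object(page_url)
    row_items = [[li.text, city]
                 for ul in page_soup.find_all('ul')[ul_start:ul_stop]
                 for li in ul.find_all('li')]
    return pd.DataFrame(row_items, columns=['Neighborhood', 'City'])

# e.g. scrape_neighborhood_list('https://en.wikipedia.org/wiki/Neighbourhoods_of_Delhi', 'Delhi', 1, 10)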

Delhi¶

In [78]:
delhi_data_url = 'https://en.wikipedia.org/wiki/Neighbourhoods_of_Delhi'
# initialize soup object
soup = get_soup_object(delhi_data_url)
print('soup object created')
soup object created
In [83]:
# get the relevant rows
row_groups = soup.find_all('ul')
row_groups = row_groups[1:10]
row_items = []
for row_group in row_groups:
    rows = row_group.find_all('li')
    for row in rows:
        row_items.append([row.text,'Delhi'])
# print the number of neighborhoods obtained
print(len(row_items))
185
In [84]:
# create a data frame
header = ['Neighborhood','City']
df = pd.DataFrame(data= row_items, columns= header)
df.tail()
Out[84]:
      Neighborhood   City
180  Tihar Village  Delhi
181    Tilak Nagar  Delhi
182    Uttam Nagar  Delhi
183    Vikas Nagar  Delhi
184      Vikaspuri  Delhi
In [85]:
# using the get_latlng function to define latitude and longitude columns of the data frame
df[['Latitude','Longitude']] = df.apply(lambda x: get_latlng(x.Neighborhood + ', ' + x.City), axis=1)
df.head()
Out[85]:
   Neighborhood   City   Latitude  Longitude
0  Adarsh Nagar  Delhi  28.720341  77.172661
1   Ashok Vihar  Delhi  28.690420  77.176064
2       Azadpur  Delhi  28.712420  77.173111
3        Bawana  Delhi  28.797661  77.045258
4     Begum Pur  Delhi  28.732599  77.052170
In [86]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185 entries, 0 to 184
Data columns (total 4 columns):
Neighborhood    185 non-null object
City            185 non-null object
Latitude        185 non-null float64
Longitude       185 non-null float64
dtypes: float64(2), object(2)
memory usage: 5.9+ KB
In [87]:
df.to_csv('delhi_subdiv.csv')

Chennai¶

In [38]:
# Initialize the url
chennai_data_url = 'https://en.wikipedia.org/wiki/List_of_neighbourhoods_of_Chennai'
# initialize soup object
soup = get_soup_object(chennai_data_url)
print('soup object created')
soup object created
In [45]:
# get the relevant rows
row_groups = soup.find_all('ul')
row_groups = row_groups[1:8]
row_items = []
for row_group in row_groups:
    rows = row_group.find_all('li')
    for row in rows:
        row_items.append([row.text,'Chennai'])
# print the number of neighborhoods obtained
print(len(row_items))
181
In [46]:
# create a data frame
header = ['Neighborhood','City']
df = pd.DataFrame(data= row_items, columns= header)
df.head()
Out[46]:
  Neighborhood     City
0    Red Hills  Chennai
1   Ayanavaram  Chennai
2    Royapuram  Chennai
3   Korukkupet  Chennai
4   Vyasarpadi  Chennai
In [47]:
# using the get_latlng function to define latitude and longitude columns of the data frame
df[['Latitude','Longitude']] = df.apply(lambda x: get_latlng(x.Neighborhood + ', ' + x.City), axis=1)
df.head()
Out[47]:
  Neighborhood     City  Latitude  Longitude
0    Red Hills  Chennai  13.19543  80.184303
1   Ayanavaram  Chennai  13.09883  80.232384
2    Royapuram  Chennai  13.11396  80.294220
3   Korukkupet  Chennai  13.11680  80.277298
4   Vyasarpadi  Chennai  13.11778  80.251678
In [48]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181 entries, 0 to 180
Data columns (total 4 columns):
Neighborhood    181 non-null object
City            181 non-null object
Latitude        181 non-null float64
Longitude       181 non-null float64
dtypes: float64(2), object(2)
memory usage: 5.7+ KB
In [49]:
df.to_csv('chennai_subdiv.csv')

Kolkata¶

In [59]:
# Initialize the url
kolkata_data_url = 'https://en.wikipedia.org/wiki/Neighbourhoods_in_Kolkata_Metropolitan_Area'
# initialize soup object
soup = get_soup_object(kolkata_data_url)
print('soup object created')
soup object created
In [71]:
# get the relevant rows
row_groups = soup.find_all('ul')
row_groups = row_groups[1:7]
row_items = []
for row_group in row_groups:
    rows = row_group.find_all('li')
    for row in rows:
        row_items.append([row.text,'Kolkata'])
        
print(len(row_items))
43
In [72]:
# create a data frame
header = ['Neighborhood','City']
df = pd.DataFrame(data= row_items, columns= header)
df.head()
Out[72]:
               Neighborhood     City
0      Kalyani Municipality  Kolkata
1     Gayespur Municipality  Kolkata
2  Kanchrapara Municipality  Kolkata
3    Halisahar Municipality  Kolkata
4      Naihati Municipality  Kolkata
In [73]:
# using the get_latlng function to define latitude and longitude columns of the data frame
df[['Latitude','Longitude']] = df.apply(lambda x: get_latlng(x.Neighborhood + ', ' + x.City), axis=1)
df.head()
Out[73]:
               Neighborhood     City   Latitude  Longitude
0      Kalyani Municipality  Kolkata  22.570539  88.371239
1     Gayespur Municipality  Kolkata  22.570539  88.371239
2  Kanchrapara Municipality  Kolkata  22.951059  88.431023
3    Halisahar Municipality  Kolkata  22.570539  88.371239
4      Naihati Municipality  Kolkata  22.895760  88.428757
In [74]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 4 columns):
Neighborhood    43 non-null object
City            43 non-null object
Latitude        43 non-null float64
Longitude       43 non-null float64
dtypes: float64(2), object(2)
memory usage: 1.4+ KB
In [75]:
df.to_csv('kolkata_subdiv.csv')
In [116]:
df = pd.read_csv('kolkata_subdiv.csv',index_col = 0)
In [118]:
df.head()
Out[118]:
               Neighborhood     City   Latitude  Longitude
0      Kalyani Municipality  Kolkata  22.570539  88.371239
1     Gayespur Municipality  Kolkata  22.570539  88.371239
2  Kanchrapara Municipality  Kolkata  22.951059  88.431023
3    Halisahar Municipality  Kolkata  22.570539  88.371239
4      Naihati Municipality  Kolkata  22.895760  88.428757

Mumbai¶

In [90]:
# Initialize the url
mumbai_data_url = 'https://en.wikipedia.org/wiki/List_of_neighbourhoods_in_Mumbai'
# initialize soup object
soup = get_soup_object(mumbai_data_url)
print('soup object created')
soup object created
In [102]:
# get the relevant rows
row_groups = soup.find_all('ul')
row_groups = row_groups[5:36]
row_items = []
for row_group in row_groups:
    rows = row_group.find_all('li')
    for row in rows:
        row_items.append([row.text,'Mumbai'])
        
print(len(row_items))
122
In [103]:
# create a data frame
header = ['Neighborhood','City']
df = pd.DataFrame(data= row_items, columns= header)
df.head()
Out[103]:
     Neighborhood    City
0          Amboli  Mumbai
1         Chakala  Mumbai
2      D.N. Nagar  Mumbai
3  Four Bungalows  Mumbai
4        JB Nagar  Mumbai
In [104]:
# using the get_latlng function to define latitude and longitude columns of the data frame
df[['Latitude','Longitude']] = df.apply(lambda x: get_latlng(x.Neighborhood + ', ' + x.City), axis=1)
df.head()
Out[104]:
     Neighborhood    City   Latitude  Longitude
0          Amboli  Mumbai  19.129061  72.846451
1         Chakala  Mumbai  19.108360  72.862343
2      D.N. Nagar  Mumbai  19.124084  72.831375
3  Four Bungalows  Mumbai  19.126301  72.824318
4        JB Nagar  Mumbai  19.105770  72.864098
In [105]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 4 columns):
Neighborhood    122 non-null object
City            122 non-null object
Latitude        122 non-null float64
Longitude       122 non-null float64
dtypes: float64(2), object(2)
memory usage: 3.9+ KB
In [106]:
df.to_csv('mumbai_subdiv.csv')

Visualize the locations of a city on a map¶

In [52]:
#!conda install -c conda-forge folium --yes # uncomment this line if folium is missing
import folium
In [111]:
# Function takes a data frame with Latitude, Longitude, Neighborhood and City columns
# and plots each location as a circle marker on the given folium map
def visualize_area_in_map(data, city_map):
    # add markers to the map
    for lat, lng, neighborhood, city in zip(data['Latitude'], data['Longitude'], data['Neighborhood'], data['City']):
        label = '{}, {}'.format(neighborhood, city)
        label = folium.Popup(label, parse_html=True)
        folium.CircleMarker(
            [lat, lng],
            radius=2,
            popup=label,
            color='blue',
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7).add_to(city_map)

    return city_map

Visualize Mumbai's Neighborhoods¶

In [112]:
city = 'Mumbai'
latitude, longitude = get_latlng(city)
print('Lat : ',latitude,' Long : ',longitude)
Lat :  18.940170288085938  Long :  72.8348617553711
In [113]:
# create a map of Mumbai using the latitude and longitude values
city_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# drop rows with missing coordinates before plotting
data = df.dropna()

visualize_area_in_map(data, city_map)
Out[113]:
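The map renders inline in the notebook; to view or share it outside the notebook, it can also be written to a standalone HTML file, for example (the file name is arbitrary):

city_map.save('mumbai_neighborhoods.html')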
In [ ]: